# In order to run the code, uncomment this chunk and install required packages
# install.packages('readxl')
# install.packages('caret')
# install.packages('corrplot')
# install.packages('outliers')
# install.packages('e1071')
# install.packages("randomForest")
# install.packages("rpart.plot")
# install.packages("xgboost")
# install.packages("SHAPforxgboost")
# install.packages("factoextra")
# install.packages("NbClust")
# install.packages("plotly")
library("readxl")
library(corrplot)
library(outliers)
library(caret)
library(e1071)
library("randomForest")
library(rpart)
library(rpart.plot)
library(xgboost)
library("SHAPforxgboost")
library(factoextra)
library(NbClust)
library(clValid)
library(dplyr)
library(plotly)
# Loading data
data <- read_excel("ChurnData.xlsx")
New names:
• `` -> `...18`
head(data)
# Removing NA values
data <- data[,colSums(is.na(data))<nrow(data)]
data <- na.omit(data)
# Removing unnecessary column Customer
data <- subset(data, select = -Customer)
# Removing spaces in column names
colnames(data) <- make.names(names(data))
# Removing 0s in Account.Length column
data[data$Account.Length == 0, ] <- 1
hist(data$Customer.Service.Calls,
main = "Customer service calls distribution",
xlab = "Customer service calls, calls",
xlim = c(0, max(data$Customer.Service.Calls)),
col = "dodgerblue3",
breaks = 10)
axis(side=1, at=seq(0, 9, 1))

# Churners VS Non churners distribution
num_of_churners_non_churners <- c(nrow(data[data$Churn==1, ]),nrow(data[data$Churn==0, ]))
percentage <- round(num_of_churners_non_churners * 100 / sum(num_of_churners_non_churners), 1)
colors <- c("dodgerblue1","dodgerblue4")
pie(num_of_churners_non_churners,
main = "Churners VS Non churners distribution",
labels = percentage,
col = colors)
legend("topright", c("Churners","Non churners"), cex = 1, fill = colors)

par(mfrow=c(3,1))
# Income boxplot
boxplot(data$Income,
scientific = FALSE,
horizontal = TRUE,
ylim = c(0, max(data$Income)),
main = "Income distribution",
xlab = "Annual Income, $",
col="dodgerblue3")
# Account Length boxplot
boxplot(data$Account.Length,
scientific = FALSE,
horizontal = TRUE,
ylim = c(0, max(data$Account.Length)),
main = "Account Length distribution",
xlab = "Account Length, months",
col="dodgerblue3")
# Voice Mail Messages boxplot
boxplot(data$Voice.Mail.Messages,
scientific = FALSE,
horizontal = TRUE,
ylim = c(0, max(data$Voice.Mail.Messages)),
main = "Voice Mail Messages distribution",
xlab = "Voice Mail Messages, messages",
col="dodgerblue3")

par(mfrow=c(3,1))
# Day Minutes boxplot
boxplot(data$Day.Minutes,
scientific = FALSE,
horizontal = TRUE,
ylim = c(0, max(data$Day.Minutes)),
main = "Day Minutes distribution",
xlab = "Day Minutes, min",
col="dodgerblue3")
# Evening Minutes boxplot
boxplot(data$Evening.Minutes,
scientific = FALSE,
horizontal = TRUE,
ylim = c(0, max(data$Evening.Minutes)),
main = "Evening Minutes distribution",
xlab = "Evening Minutes, min",
col="dodgerblue3")
# Night Minutes boxplot
boxplot(data$Night.Minutes,
scientific = FALSE,
horizontal = TRUE,
ylim = c(0, max(data$Night.Minutes)),
main = "Night Minutes distribution",
xlab = "Night Minutes, min",
col="dodgerblue3")

# International Minutes boxplot
boxplot(data$International.Minutes,
scientific = FALSE,
horizontal = TRUE,
ylim = c(0, max(data$International.Minutes)),
main = "International Minutes distribution",
xlab = "International Minutes, min",
col="dodgerblue3")

data_out <- data[, c("Income", "Account.Length", "Day.Minutes", "Day.Calls", "Evening.Minutes", "Evening.Calls", "Night.Minutes", "Night.Calls", "Customer.Service.Calls")]
# Filtering out outliers using z-score method
z_scores <- abs(scale(data_out))
not_outliers <- which(!rowSums(z_scores>3))
data <- data[rownames(data) %in% not_outliers, ]
# Building the correlation plot to see if there is correlation between columns
corrplot(cor(data),
method = "circle",
type = "lower",
tl.col = "black",
tl.cex = 0.6,
bg = "white",
mar=c(0,0,2,0))

# Calculating monthly day, evening and night calls durations
data$dcall_avg <- round(data$Day.Minutes / data$Account.Length, 3)
data$ecall_avg <- round(data$Evening.Minutes / data$Account.Length, 3)
data$ncall_avg <- round(data$Night.Minutes / data$Account.Length, 3)
# Calculating average monthly day, evening and night calls durations for each plan type
dcalls_monthly_avg <- c(mean(data[data$Plan.Type==1, ]$dcall_avg), mean(data[data$Plan.Type==2, ]$dcall_avg), mean(data[data$Plan.Type==3, ]$dcall_avg), mean(data[data$Plan.Type==4, ]$dcall_avg), mean(data[data$Plan.Type==5, ]$dcall_avg), mean(data[data$Plan.Type==6, ]$dcall_avg), mean(data[data$Plan.Type==7, ]$dcall_avg), mean(data[data$Plan.Type==8, ]$dcall_avg), mean(data[data$Plan.Type==9, ]$dcall_avg), mean(data[data$Plan.Type==10, ]$dcall_avg))
ecalls_monthly_avg <- c(mean(data[data$Plan.Type==1, ]$ecall_avg), mean(data[data$Plan.Type==2, ]$ecall_avg), mean(data[data$Plan.Type==3, ]$ecall_avg), mean(data[data$Plan.Type==4, ]$ecall_avg), mean(data[data$Plan.Type==5, ]$ecall_avg), mean(data[data$Plan.Type==6, ]$ecall_avg), mean(data[data$Plan.Type==7, ]$ecall_avg), mean(data[data$Plan.Type==8, ]$ecall_avg), mean(data[data$Plan.Type==9, ]$ecall_avg), mean(data[data$Plan.Type==10, ]$ecall_avg))
ncalls_monthly_avg <- c(mean(data[data$Plan.Type==1, ]$ncall_avg), mean(data[data$Plan.Type==2, ]$ncall_avg), mean(data[data$Plan.Type==3, ]$ncall_avg), mean(data[data$Plan.Type==4, ]$ncall_avg), mean(data[data$Plan.Type==5, ]$ncall_avg), mean(data[data$Plan.Type==6, ]$ncall_avg), mean(data[data$Plan.Type==7, ]$ncall_avg), mean(data[data$Plan.Type==8, ]$ncall_avg), mean(data[data$Plan.Type==9, ]$ncall_avg), mean(data[data$Plan.Type==10, ]$ncall_avg))
# Visualizing average monthly day, evening and night calls durations for each plan type
plan_types <- c("1","2","3","4","5","6","7","8","9","10")
colors <- c("dodgerblue1","dodgerblue3","dodgerblue4")
day_time <- c("Day","Evening","Night")
calls_monthly_avg <- matrix(c(dcalls_monthly_avg, ecalls_monthly_avg, ncalls_monthly_avg), nrow = 3, ncol = 10, byrow = TRUE)
barplot(calls_monthly_avg,
main = "Average monthly calls duration by plan type",
names.arg = plan_types,
xlab = "Plan type",
ylab = "Average monthly calls duration, min",
col = colors)
legend("topright", day_time, cex = 0.6, fill = colors)

# Stratified random split
set.seed(123)
split_index <- createDataPartition(data$Churn, p = .7, list = FALSE, times = 1)
train <- data[split_index,]
test <- data[-split_index,]
train$Churn <- as.factor(train$Churn)
test$Churn <- as.factor(test$Churn)
# Creating additional variables to get rid of multicorrelation
data$dcall_dur_avg <- round(data$Day.Minutes / data$Day.Calls, 3)
data$ecall_dur_avg <- round(data$Evening.Minutes / data$Day.Calls, 3)
data$ncall_dur_avg <- round(data$Night.Minutes / data$Day.Calls, 3)
# Selecting features that will not be used for analysis
dropped_columns <- c("Voice.Mail","Day.Calls","Evening.Calls","Night.Calls","International.Plan.YES","International.Calls","Has.Phone","Phone.Payment.left","dcall_avg","ecall_avg","ncall_avg","dcall_dur_avg","ecall_dur_avg","ncall_dur_avg")
# Dropping unnecessary columns
train <- train[,!(names(train) %in% dropped_columns)]
test <- test[,!(names(test) %in% dropped_columns)]
# Creating dataframe that will later be used to build an ensamble algorithm
data_ens <- data[,!(names(data) %in% dropped_columns)]
# This formula will be used for all models
formula <- 'Churn ~ Income + Gender + Account.Length + Plan.Type + Voice.Mail.Messages + Day.Minutes + Evening.Minutes + Night.Minutes + International.Minutes + Phone.monthly.payment + Customer.Service.Calls'
# Logistic regression
logit <- glm(as.formula(formula),family=binomial(link='logit'),data=train)
summary(logit)
Call:
glm(formula = as.formula(formula), family = binomial(link = "logit"),
data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.7924 -0.5024 -0.3083 -0.1711 2.9670
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -9.497e+00 8.894e-01 -10.678 < 2e-16 ***
Income 5.371e-06 3.475e-06 1.546 0.12218
Gender -9.708e-02 1.845e-01 -0.526 0.59880
Account.Length -2.052e-03 6.452e-03 -0.318 0.75043
Plan.Type -7.364e-03 3.196e-02 -0.230 0.81775
Voice.Mail.Messages -2.268e-02 7.954e-03 -2.851 0.00436 **
Day.Minutes 1.952e-02 1.942e-03 10.053 < 2e-16 ***
Evening.Minutes 8.412e-03 1.987e-03 4.232 2.31e-05 ***
Night.Minutes 4.695e-03 1.891e-03 2.482 0.01305 *
International.Minutes 1.434e-01 1.853e-02 7.736 1.02e-14 ***
Phone.monthly.payment -8.578e-03 4.287e-03 -2.001 0.04540 *
Customer.Service.Calls 4.353e-01 7.082e-02 6.147 7.90e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1019.99 on 1300 degrees of freedom
Residual deviance: 787.78 on 1289 degrees of freedom
AIC: 811.78
Number of Fisher Scoring iterations: 6
# Trained logistic regression predictions
model_fit <- predict(logit, test, type = 'response')
logit_pred <- ifelse(model_fit > 0.5,1,0)
# Confusion matrix for logistic regression
logit_conf <- confusionMatrix(as.factor(logit_pred), test$Churn, mode = "everything", positive = "1")
logit_conf
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 475 56
1 10 16
Accuracy : 0.8815
95% CI : (0.8517, 0.9072)
No Information Rate : 0.8707
P-Value [Acc > NIR] : 0.2462
Kappa : 0.2769
Mcnemar's Test P-Value : 3.04e-08
Sensitivity : 0.22222
Specificity : 0.97938
Pos Pred Value : 0.61538
Neg Pred Value : 0.89454
Precision : 0.61538
Recall : 0.22222
F1 : 0.32653
Prevalence : 0.12926
Detection Rate : 0.02873
Detection Prevalence : 0.04668
Balanced Accuracy : 0.60080
'Positive' Class : 1
logit_f <- logit_conf$byClass[7]
# Algorithm to tune parameters of support vector machines
#
# tune_out <- tune.svm(x = train[, -12], y = train$Churn,
# type = "C-classification",
# kernel = "polynomial", degree = 2, cost = 10^(-1:1),
# gamma = c(0.1, 0.5, 1), coef0 = c(0.1, 0.5, 1))
#
#
# Obtained optimal values will further be used in svm
# tune_out$best.parameters$cost
# tune_out$best.parameters$gamma
# tune_out$best.parameters$coef0
set.seed(123)
# Support vector machines
svm_m = svm(formula = as.formula(formula), data = train, scale = TRUE, type = 'C-classification', kernel = "polynomial", degree = 2, cost = 0.1, gamma = 1, coef0 = 0.5)
set.seed(123)
# Support vector machines predictions
svm_p <- predict(svm_m, newdata = test)
svm_conf <- confusionMatrix(svm_p, test$Churn, mode = "everything", positive = "1")
svm_conf
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 478 37
1 7 35
Accuracy : 0.921
95% CI : (0.8954, 0.942)
No Information Rate : 0.8707
P-Value [Acc > NIR] : 0.0001144
Kappa : 0.5734
Mcnemar's Test P-Value : 1.232e-05
Sensitivity : 0.48611
Specificity : 0.98557
Pos Pred Value : 0.83333
Neg Pred Value : 0.92816
Precision : 0.83333
Recall : 0.48611
F1 : 0.61404
Prevalence : 0.12926
Detection Rate : 0.06284
Detection Prevalence : 0.07540
Balanced Accuracy : 0.73584
'Positive' Class : 1
svm_f <- svm_conf$byClass[7]
# Tuning parameter mtry for random forest
set.seed(123)
train_rf <- as.data.frame(train)
bestmtry <- tuneRF(train_rf[, -12], train_rf[,12], stepFactor=1.5, improve=1e-5, ntree=500)
mtry = 3 OOB error = 8.46%
Searching left ...
mtry = 2 OOB error = 8.99%
-0.06363636 1e-05
Searching right ...
mtry = 4 OOB error = 8.69%
-0.02727273 1e-05

print(bestmtry)
mtry OOBError
2.OOB 2 0.08993082
3.OOB 3 0.08455035
4.OOB 4 0.08685626
set.seed(123)
# Random forest
randomf = randomForest(as.formula(formula), ntree = 500, mtry = 3, data = train)
set.seed(123)
# Random forest predictions
y_pred = predict(randomf, newdata = test[-12])
randf_conf <- confusionMatrix(y_pred, test$Churn, mode = "everything", positive = "1")
importance(randomf)
MeanDecreaseGini
Income 21.051915
Gender 3.779393
Account.Length 17.793486
Plan.Type 12.362275
Voice.Mail.Messages 10.860191
Day.Minutes 96.341691
Evening.Minutes 36.883533
Night.Minutes 26.307368
International.Minutes 26.356792
Phone.monthly.payment 15.413500
Customer.Service.Calls 32.591108
randf_conf
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 479 32
1 6 40
Accuracy : 0.9318
95% CI : (0.9076, 0.9513)
No Information Rate : 0.8707
P-Value [Acc > NIR] : 2.423e-06
Kappa : 0.6419
Mcnemar's Test P-Value : 5.002e-05
Sensitivity : 0.55556
Specificity : 0.98763
Pos Pred Value : 0.86957
Neg Pred Value : 0.93738
Precision : 0.86957
Recall : 0.55556
F1 : 0.67797
Prevalence : 0.12926
Detection Rate : 0.07181
Detection Prevalence : 0.08259
Balanced Accuracy : 0.77159
'Positive' Class : 1
randf_f <- randf_conf$byClass[7]
set.seed(123)
# Simple decision treee
dtree <- rpart(formula = as.formula(formula), data = train, cp=0.01)
# Choosing optimal complexity parameter (cp) using plot
printcp(dtree)
Classification tree:
rpart(formula = as.formula(formula), data = train, cp = 0.01)
Variables actually used in tree construction:
[1] Customer.Service.Calls Day.Minutes Evening.Minutes International.Minutes
[5] Phone.monthly.payment Voice.Mail.Messages
Root node error: 173/1301 = 0.13297
n= 1301
CP nsplit rel error xerror xstd
1 0.167630 0 1.00000 1.00000 0.070793
2 0.063584 1 0.83237 0.87283 0.066781
3 0.057803 3 0.70520 0.82659 0.065214
4 0.023121 4 0.64740 0.74566 0.062312
5 0.011561 10 0.50867 0.80925 0.064609
6 0.010000 11 0.49711 0.79191 0.063996
plotcp(dtree)

rpart.plot(dtree)

set.seed(123)
# Devision tree predictions
tree_pred <- predict(dtree, test, type = 'class')
dtree_conf <- confusionMatrix(tree_pred, test$Churn, mode = "everything", positive = "1")
dtree_conf
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 473 30
1 12 42
Accuracy : 0.9246
95% CI : (0.8994, 0.9451)
No Information Rate : 0.8707
P-Value [Acc > NIR] : 3.499e-05
Kappa : 0.6251
Mcnemar's Test P-Value : 0.008712
Sensitivity : 0.58333
Specificity : 0.97526
Pos Pred Value : 0.77778
Neg Pred Value : 0.94036
Precision : 0.77778
Recall : 0.58333
F1 : 0.66667
Prevalence : 0.12926
Detection Rate : 0.07540
Detection Prevalence : 0.09695
Balanced Accuracy : 0.77930
'Positive' Class : 1
dtree_f <- dtree_conf$byClass[7]
# Preparing data for xgboost algorithm
xgb_train_churn <- as.matrix(train[12])
xgb_train <- data.matrix(train[-12])
xgb_test_churn <- as.matrix(test[12])
xgb_test <- data.matrix(test[-12])
xgb_train_m = xgb.DMatrix(data=xgb_train, label=xgb_train_churn)
xgb_test_m = xgb.DMatrix(data=xgb_test, label=xgb_test_churn)
params <- list(booster = "gbtree", objective = "binary:logistic", eta=0.3, gamma=0, max_depth=6, min_child_weight=1, subsample=1, colsample_bytree=1)
# Extracting optimal features for xgboost using cross validation
xgbcv <- xgb.cv( params = params, data = xgb_train_m, nrounds = 100, nfold = 5, showsd = T, stratified = T, print.every.n = 10, early.stop.round = 20, maximize = T)
Warning: 'print.every.n' is deprecated.
Use 'print_every_n' instead.
See help("Deprecated") and help("xgboost-deprecated").
Warning: 'early.stop.round' is deprecated.
Use 'early_stopping_rounds' instead.
See help("Deprecated") and help("xgboost-deprecated").
[1] train-logloss:0.490664+0.002389 test-logloss:0.514700+0.002804
Multiple eval metrics are present. Will use test_logloss for early stopping.
Will train until test_logloss hasn't improved in 20 rounds.
[11] train-logloss:0.107408+0.004752 test-logloss:0.237669+0.016557
[21] train-logloss:0.060796+0.003263 test-logloss:0.240536+0.027036
Stopping. Best iteration:
[1] train-logloss:0.490664+0.002389 test-logloss:0.514700+0.002804
# XGBoost algorithm
set.seed(123)
xgb <- xgboost(data = xgb_train_m, params=params, nrounds=41)
[1] train-logloss:0.492093
[2] train-logloss:0.374901
[3] train-logloss:0.302414
[4] train-logloss:0.251825
[5] train-logloss:0.214501
[6] train-logloss:0.184347
[7] train-logloss:0.161494
[8] train-logloss:0.145188
[9] train-logloss:0.129774
[10] train-logloss:0.119050
[11] train-logloss:0.109443
[12] train-logloss:0.101285
[13] train-logloss:0.094453
[14] train-logloss:0.086692
[15] train-logloss:0.084022
[16] train-logloss:0.081549
[17] train-logloss:0.077827
[18] train-logloss:0.072342
[19] train-logloss:0.069526
[20] train-logloss:0.066059
[21] train-logloss:0.062552
[22] train-logloss:0.061327
[23] train-logloss:0.059917
[24] train-logloss:0.057204
[25] train-logloss:0.054538
[26] train-logloss:0.053387
[27] train-logloss:0.051132
[28] train-logloss:0.049806
[29] train-logloss:0.046619
[30] train-logloss:0.045243
[31] train-logloss:0.042654
[32] train-logloss:0.041777
[33] train-logloss:0.040314
[34] train-logloss:0.039193
[35] train-logloss:0.038492
[36] train-logloss:0.037844
[37] train-logloss:0.037258
[38] train-logloss:0.035513
[39] train-logloss:0.034067
[40] train-logloss:0.033318
[41] train-logloss:0.032010
set.seed(123)
# xgboost predictions
xgb_p = predict(xgb, xgb_test_m)
xgb_p = as.factor(round(xgb_p))
xgb_conf <- confusionMatrix(xgb_p, test$Churn, mode = "everything", positive = "1")
xgb_conf
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 474 25
1 11 47
Accuracy : 0.9354
95% CI : (0.9116, 0.9543)
No Information Rate : 0.8707
P-Value [Acc > NIR] : 5.43e-07
Kappa : 0.687
Mcnemar's Test P-Value : 0.03026
Sensitivity : 0.65278
Specificity : 0.97732
Pos Pred Value : 0.81034
Neg Pred Value : 0.94990
Precision : 0.81034
Recall : 0.65278
F1 : 0.72308
Prevalence : 0.12926
Detection Rate : 0.08438
Detection Prevalence : 0.10413
Balanced Accuracy : 0.81505
'Positive' Class : 1
xgb_f <- xgb_conf$byClass[7]
# Importance of features
xgb.importance(model=xgb)
# SHaP values for each feature
shap <- shap.prep(xgb_model = xgb, X_train = xgb_train)
shap.plot.summary(shap)

shap.plot.dependence(shap, "Day.Minutes", alpha = 0.7, jitter_width = 0.1)
`geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(shap, "Day.Minutes", color_feature = 'Evening.Minutes', alpha = 0.7, jitter_width = 0.1)
`geom_smooth()` using formula 'y ~ x'

shap.plot.dependence(shap, "Customer.Service.Calls", color_feature = 'Day.Minutes', alpha = 0.7, jitter_width = 0.1)
`geom_smooth()` using formula 'y ~ x'
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric, :
pseudoinverse used at -0.025
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric, :
neighborhood radius 2.025
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric, :
reciprocal condition number 5.302e-15
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric, :
There are other near singularities as well. 1

shap.plot.dependence(shap, "International.Minutes", color_feature = 'auto', alpha = 0.7, jitter_width = 0.1)
`geom_smooth()` using formula 'y ~ x'
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric, :
at -0.0945
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric, :
radius 0.0089303
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric, :
all data on boundary of neighborhood. make span bigger
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric, :
pseudoinverse used at -0.0945
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric, :
neighborhood radius 0.0945
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric, :
reciprocal condition number 1
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric, :
zero-width neighborhood. make span bigger
Warning: Computation failed in `stat_smooth()`:
NA/NaN/Inf in foreign function call (arg 5)

shap.plot.dependence(shap, "Evening.Minutes", color_feature = 'auto', alpha = 0.7, jitter_width = 0.1)
`geom_smooth()` using formula 'y ~ x'

set.seed(123)
# Predicting churn with each model
log_pred <- predict(logit, newdata = data_ens, type = 'response')
log_pred <- as.factor(ifelse(log_pred > 0.5,1,0))
svm_pred <- predict(svm_m, newdata = data_ens)
randf_pred <- predict(randomf, newdata = data_ens)
dtree_pred <- predict(dtree, newdata = data_ens, type = 'class')
xgb_matrix <- xgb.DMatrix(data=data.matrix(data_ens[,-12]), label=as.matrix(data_ens$Churn))
xgb_pred <- predict(xgb, newdata = xgb_matrix)
xgb_pred <- as.factor(round(xgb_pred))
# Building the dataframe for ensemble method
predictions <- data.frame(log_pred, svm_pred, randf_pred, dtree_pred, xgb_pred)
predictions$ens <- NA
# Assigning scores for each model based on respective F-score
sum_f <- sum(logit_f, svm_f, dtree_f, randf_f, xgb_f)
log_score <- (logit_f / sum_f)
svm_score <- (svm_f / sum_f)
randf_score <- (randf_f / sum_f)
dtree_score <- (dtree_f / sum_f)
xgb_score <- (xgb_f / sum_f)
scores <- c(log_score, svm_score, randf_score, dtree_score,xgb_score)
# Ensemble prediction function
for(i in 1:length(log_pred)){
y = 0
n = 0
for (j in 1:5){
if (predictions[i, j] == 0){
n = n + scores[j]
}else{
y = y + scores[j]
}
}
if (y > n){
predictions$ens[i] = 1
}else{
predictions$ens[i] = 0
}
}
# Measuring the performance of an ensemble
ens_conf <- confusionMatrix(as.factor(predictions$ens), as.factor(data_ens$Churn), mode = 'everything', positive = '1')
ens_conf
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 1610 81
1 3 164
Accuracy : 0.9548
95% CI : (0.9443, 0.9638)
No Information Rate : 0.8681
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.7717
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.66939
Specificity : 0.99814
Pos Pred Value : 0.98204
Neg Pred Value : 0.95210
Precision : 0.98204
Recall : 0.66939
F1 : 0.79612
Prevalence : 0.13186
Detection Rate : 0.08827
Detection Prevalence : 0.08988
Balanced Accuracy : 0.83376
'Positive' Class : 1
ens_f <- ens_conf$byClass[7]
f_scores_all <- c(logit_f, svm_f, dtree_f, randf_f, xgb_f, ens_f)
f_scores_all
F1 F1 F1 F1 F1 F1
0.3265306 0.6140351 0.6666667 0.6779661 0.7230769 0.7961165
# Min max normalization function
minmax_norm <- function(x) {
(x - min(x)) / (max(x) - min(x))
}
# Dataframe consisting only of people who churned
churn_df <- data_ens[data_ens$Churn==1, ]
churn_df <- churn_df[, c("Day.Minutes", "International.Minutes", "Customer.Service.Calls")]
# churn_df <- as.data.frame(lapply(churn_df, minmax_norm))
# churn_df <- churn_df[,!(names(churn_df) %in% c("Plan.Type", "Gender", "Churn"))]
churn_df <- as.data.frame(scale(churn_df))
# pca_res = prcomp(churn_df, center = TRUE, scale = TRUE)
# summary(pca_res)
# Visualizing elbow method
wss <- sapply(1:10, function(k){kmeans(churn_df, k, nstart=25,iter.max = 50 )$tot.withinss})
plot(1:10, wss,
type="b", pch = 19, frame = F,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")

# Defining optimal number of k for k-means algorithm
set.seed(123)
# Silhouette method
fviz_nbclust(churn_df, kmeans, nstart = 25, k.max = 15, iter.max = 50, method = "silhouette")+
labs(subtitle = "Silhouette method")

# K-means with k=4
k4 <- kmeans(churn_df, centers = 4, iter.max = 50, nstart = 25)
k4
K-means clustering with 4 clusters of sizes 36, 115, 45, 49
Cluster means:
Day.Minutes International.Minutes Customer.Service.Calls
1 -1.0930388 1.3095389 0.4343530
2 0.6160607 -0.6653610 -0.4337388
3 0.4143949 1.3927979 -0.7551460
4 -1.0233746 -0.6796489 1.3923433
Clustering vector:
[1] 2 2 1 3 4 2 4 4 2 2 2 4 2 2 3 2 2 3 2 2 1 3 2 1 4 1 2 4 1 2 2 4 2 2 2 2 4 4 2 2 1 3 1 2 3 2 2 2 2 2 1 2
[53] 3 2 3 1 4 2 2 2 1 2 1 2 4 4 4 3 2 2 2 2 2 2 3 3 4 4 3 4 1 4 3 2 4 2 4 4 1 2 2 2 1 4 3 4 4 1 1 3 3 4 4 3
[105] 2 3 4 2 2 4 2 3 2 2 3 3 4 4 2 3 2 3 4 2 3 4 2 2 2 3 2 3 4 4 2 2 3 2 2 2 2 1 2 2 4 1 2 1 2 2 1 2 2 2 2 2
[157] 3 2 1 2 2 1 1 2 3 4 2 1 2 2 2 2 1 4 4 4 2 2 3 4 2 2 3 2 3 3 2 1 4 1 4 2 3 2 2 4 1 4 2 1 2 2 1 4 1 2 1 2
[209] 2 2 3 4 3 3 3 4 2 4 2 3 2 2 2 2 3 1 3 2 4 1 3 2 3 4 1 2 3 2 2 2 2 1 2 2 3
Within cluster sum of squares by cluster:
[1] 51.23809 90.22534 43.70436 29.47427
(between_SS / total_SS = 70.7 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size"
[8] "iter" "ifault"
# Evaluating built clusters with silhouette width
sil <- silhouette(k4$cluster, dist(churn_df))
fviz_silhouette(sil)

churn_df$cluster = factor(k4$cluster)
p <- plot_ly(churn_df, x=~Day.Minutes, y=~International.Minutes,
z=~Customer.Service.Calls, color=~cluster) %>% add_markers(size=1.5)
print(p)
NULL
---
title: "Developing an ensemble approach for predicting customer churn"
output: html_notebook
author: Nazar Todoshchuk
---

```{r}
# In order to run the code, uncomment this chunk and install required packages

# install.packages('readxl')
# install.packages('caret')
# install.packages('corrplot')
# install.packages('outliers')
# install.packages('e1071')
# install.packages("randomForest")
# install.packages("rpart.plot")
# install.packages("xgboost")
# install.packages("SHAPforxgboost")
# install.packages("factoextra")
# install.packages("NbClust")
# install.packages("plotly")
```

```{r}
library("readxl")
library(corrplot)
library(outliers)
library(caret)
library(e1071)
library("randomForest")
library(rpart)
library(rpart.plot)
library(xgboost)
library("SHAPforxgboost")
library(factoextra)
library(NbClust)
library(clValid)
library(dplyr)
library(plotly)
```

```{r}
# Loading data
data <- read_excel("ChurnData.xlsx")
head(data)
```

```{r}
# Removing NA values
data <- data[,colSums(is.na(data))<nrow(data)]
data <- na.omit(data)
# Removing unnecessary column Customer
data <- subset(data, select = -Customer)
# Removing spaces in column names
colnames(data) <- make.names(names(data))
# Removing 0s in Account.Length column
data[data$Account.Length == 0, ] <- 1
```

```{r}
hist(data$Customer.Service.Calls,
    main = "Customer service calls distribution",
    xlab = "Customer service calls, calls",
    xlim = c(0, max(data$Customer.Service.Calls)),
    col = "dodgerblue3",
    breaks = 10)

axis(side=1, at=seq(0, 9, 1))
```

```{r}
# Churners VS Non churners distribution
num_of_churners_non_churners <- c(nrow(data[data$Churn==1, ]),nrow(data[data$Churn==0, ]))
percentage <- round(num_of_churners_non_churners * 100 / sum(num_of_churners_non_churners), 1)
colors <- c("dodgerblue1","dodgerblue4")

pie(num_of_churners_non_churners, 
    main = "Churners VS Non churners distribution",
    labels = percentage,
    col = colors)

legend("topright", c("Churners","Non churners"), cex = 1, fill = colors)

```

```{r}
par(mfrow=c(3,1))
# Income boxplot
boxplot(data$Income,
        scientific = FALSE,
        horizontal = TRUE,
        ylim = c(0, max(data$Income)),
        main = "Income distribution",
        xlab = "Annual Income, $",
        col="dodgerblue3")
# Account Length boxplot
boxplot(data$Account.Length,
        scientific = FALSE,
        horizontal = TRUE,
        ylim = c(0, max(data$Account.Length)),
        main = "Account Length distribution",
        xlab = "Account Length, months",
        col="dodgerblue3")
# Voice Mail Messages boxplot
boxplot(data$Voice.Mail.Messages,
        scientific = FALSE,
        horizontal = TRUE,
        ylim = c(0, max(data$Voice.Mail.Messages)),
        main = "Voice Mail Messages distribution",
        xlab = "Voice Mail Messages, messages",
        col="dodgerblue3")
```

```{r}
par(mfrow=c(3,1))
# Day Minutes boxplot
boxplot(data$Day.Minutes,
        scientific = FALSE,
        horizontal = TRUE,
        ylim = c(0, max(data$Day.Minutes)),
        main = "Day Minutes distribution",
        xlab = "Day Minutes, min",
        col="dodgerblue3")
# Evening Minutes boxplot
boxplot(data$Evening.Minutes,
        scientific = FALSE,
        horizontal = TRUE,
        ylim = c(0, max(data$Evening.Minutes)),
        main = "Evening Minutes distribution",
        xlab = "Evening Minutes, min",
        col="dodgerblue3")
# Night Minutes boxplot
boxplot(data$Night.Minutes,
        scientific = FALSE,
        horizontal = TRUE,
        ylim = c(0, max(data$Night.Minutes)),
        main = "Night Minutes distribution",
        xlab = "Night Minutes, min",
        col="dodgerblue3")
```

```{r}
# International Minutes boxplot
boxplot(data$International.Minutes,
        scientific = FALSE,
        horizontal = TRUE,
        ylim = c(0, max(data$International.Minutes)),
        main = "International Minutes distribution",
        xlab = "International Minutes, min",
        col="dodgerblue3")
```

```{r}
data_out <- data[, c("Income", "Account.Length", "Day.Minutes", "Day.Calls", "Evening.Minutes", "Evening.Calls", "Night.Minutes", "Night.Calls", "Customer.Service.Calls")]

# Filtering out outliers using z-score method
z_scores <- abs(scale(data_out))

not_outliers <- which(!rowSums(z_scores>3))

data <- data[rownames(data) %in% not_outliers, ]
```

```{r}
# Building the correlation plot to see if there is correlation between columns
corrplot(cor(data),
         method = "circle",
         type = "lower",
         tl.col = "black",
         tl.cex = 0.6,
         bg = "white",
         mar=c(0,0,2,0))
```

```{r}
# Calculating monthly day, evening and night calls durations
data$dcall_avg <- round(data$Day.Minutes / data$Account.Length, 3)
data$ecall_avg <- round(data$Evening.Minutes / data$Account.Length, 3)
data$ncall_avg <- round(data$Night.Minutes / data$Account.Length, 3)
```

```{r}
# Calculating average monthly day, evening and night calls durations for each plan type
dcalls_monthly_avg <- c(mean(data[data$Plan.Type==1, ]$dcall_avg), mean(data[data$Plan.Type==2, ]$dcall_avg), mean(data[data$Plan.Type==3, ]$dcall_avg), mean(data[data$Plan.Type==4, ]$dcall_avg), mean(data[data$Plan.Type==5, ]$dcall_avg), mean(data[data$Plan.Type==6, ]$dcall_avg), mean(data[data$Plan.Type==7, ]$dcall_avg), mean(data[data$Plan.Type==8, ]$dcall_avg), mean(data[data$Plan.Type==9, ]$dcall_avg), mean(data[data$Plan.Type==10, ]$dcall_avg))

ecalls_monthly_avg <- c(mean(data[data$Plan.Type==1, ]$ecall_avg), mean(data[data$Plan.Type==2, ]$ecall_avg), mean(data[data$Plan.Type==3, ]$ecall_avg), mean(data[data$Plan.Type==4, ]$ecall_avg), mean(data[data$Plan.Type==5, ]$ecall_avg), mean(data[data$Plan.Type==6, ]$ecall_avg), mean(data[data$Plan.Type==7, ]$ecall_avg), mean(data[data$Plan.Type==8, ]$ecall_avg), mean(data[data$Plan.Type==9, ]$ecall_avg), mean(data[data$Plan.Type==10, ]$ecall_avg))

ncalls_monthly_avg <- c(mean(data[data$Plan.Type==1, ]$ncall_avg), mean(data[data$Plan.Type==2, ]$ncall_avg), mean(data[data$Plan.Type==3, ]$ncall_avg), mean(data[data$Plan.Type==4, ]$ncall_avg), mean(data[data$Plan.Type==5, ]$ncall_avg), mean(data[data$Plan.Type==6, ]$ncall_avg), mean(data[data$Plan.Type==7, ]$ncall_avg), mean(data[data$Plan.Type==8, ]$ncall_avg), mean(data[data$Plan.Type==9, ]$ncall_avg), mean(data[data$Plan.Type==10, ]$ncall_avg))
```

```{r}
# Visualizing average monthly day, evening and night calls durations for each plan type
plan_types <- c("1","2","3","4","5","6","7","8","9","10")
colors <- c("dodgerblue1","dodgerblue3","dodgerblue4")
day_time <- c("Day","Evening","Night")

calls_monthly_avg <- matrix(c(dcalls_monthly_avg, ecalls_monthly_avg, ncalls_monthly_avg), nrow = 3, ncol = 10, byrow = TRUE)

barplot(calls_monthly_avg, 
        main = "Average monthly calls duration by plan type", 
        names.arg = plan_types, 
        xlab = "Plan type", 
        ylab = "Average monthly calls duration, min", 
        col = colors)

legend("topright", day_time, cex = 0.6, fill = colors)

```

```{r}
# Stratified random split
set.seed(123)
split_index <- createDataPartition(data$Churn, p = .7, list = FALSE, times = 1)
train <- data[split_index,]
test <- data[-split_index,]

train$Churn <- as.factor(train$Churn)
test$Churn <- as.factor(test$Churn)
```

```{r}
# Creating additional variables to get rid of multicorrelation
data$dcall_dur_avg <- round(data$Day.Minutes / data$Day.Calls, 3)
data$ecall_dur_avg <- round(data$Evening.Minutes / data$Day.Calls, 3)
data$ncall_dur_avg <- round(data$Night.Minutes / data$Day.Calls, 3)
```

```{r}
# Selecting features that will not be used for analysis
dropped_columns <- c("Voice.Mail","Day.Calls","Evening.Calls","Night.Calls","International.Plan.YES","International.Calls","Has.Phone","Phone.Payment.left","dcall_avg","ecall_avg","ncall_avg","dcall_dur_avg","ecall_dur_avg","ncall_dur_avg")

# Dropping unnecessary columns
train <- train[,!(names(train) %in% dropped_columns)]
test <- test[,!(names(test) %in% dropped_columns)]

# Creating dataframe that will later be used to build an ensamble algorithm
data_ens <- data[,!(names(data) %in% dropped_columns)]
```

```{r}
# This formula will be used for all models
formula <- 'Churn ~ Income + Gender + Account.Length + Plan.Type + Voice.Mail.Messages + Day.Minutes + Evening.Minutes + Night.Minutes + International.Minutes + Phone.monthly.payment + Customer.Service.Calls'
```

```{r}
# Logistic regression
logit <- glm(as.formula(formula),family=binomial(link='logit'),data=train)
summary(logit)
```

```{r}
# Trained logistic regression predictions
model_fit <- predict(logit, test, type = 'response')
logit_pred <- ifelse(model_fit > 0.5,1,0)
# Confusion matrix for logistic regression
logit_conf <- confusionMatrix(as.factor(logit_pred), test$Churn, mode = "everything", positive = "1")
logit_conf
logit_f <- logit_conf$byClass[7]
```

```{r}
# Algorithm to tune parameters of support vector machines
# 
# tune_out <- tune.svm(x = train[, -12], y = train$Churn,
#               type = "C-classification",
#               kernel = "polynomial", degree = 2, cost = 10^(-1:1),
#               gamma = c(0.1, 0.5, 1), coef0 = c(0.1, 0.5, 1))
# 
# 
# Obtained optimal values will further be used in svm
# tune_out$best.parameters$cost
# tune_out$best.parameters$gamma
# tune_out$best.parameters$coef0
```

```{r}
set.seed(123)
# Support vector machines
svm_m = svm(formula = as.formula(formula), data = train, scale = TRUE, type = 'C-classification',  kernel = "polynomial", degree = 2, cost = 0.1,  gamma = 1, coef0 = 0.5)
```

```{r}
set.seed(123)
# Support vector machines predictions
svm_p <- predict(svm_m, newdata = test)
svm_conf <- confusionMatrix(svm_p, test$Churn, mode = "everything", positive = "1")
svm_conf
svm_f <- svm_conf$byClass[7]
```
```{r}
# Tuning parameter mtry for random forest
set.seed(123)
train_rf <- as.data.frame(train)
bestmtry <- tuneRF(train_rf[, -12], train_rf[,12], stepFactor=1.5, improve=1e-5, ntree=500)
print(bestmtry)
```


```{r}
set.seed(123)
# Random forest
randomf = randomForest(as.formula(formula), ntree = 500,  mtry = 3, data = train)
```

```{r}
set.seed(123)
# Random forest predictions
y_pred = predict(randomf, newdata = test[-12])
randf_conf <- confusionMatrix(y_pred, test$Churn, mode = "everything", positive = "1")
importance(randomf)
randf_conf

randf_f <- randf_conf$byClass[7]
```

```{r}
set.seed(123)
# Simple decision treee
dtree <- rpart(formula = as.formula(formula), data = train, cp=0.01)
# Choosing optimal complexity parameter (cp) using plot
printcp(dtree)
plotcp(dtree)

```

```{r}
rpart.plot(dtree)
```

```{r}
set.seed(123)
# Devision tree predictions
tree_pred <- predict(dtree, test, type = 'class')

dtree_conf <- confusionMatrix(tree_pred, test$Churn,  mode = "everything", positive = "1")
dtree_conf
dtree_f <- dtree_conf$byClass[7]
```

```{r}
# Preparing data for xgboost algorithm
xgb_train_churn <- as.matrix(train[12])
xgb_train <- data.matrix(train[-12])

xgb_test_churn <- as.matrix(test[12])
xgb_test <- data.matrix(test[-12])

xgb_train_m = xgb.DMatrix(data=xgb_train, label=xgb_train_churn)
xgb_test_m = xgb.DMatrix(data=xgb_test, label=xgb_test_churn)
```

```{r}
params <- list(booster = "gbtree", objective = "binary:logistic", eta=0.3, gamma=0, max_depth=6, min_child_weight=1, subsample=1, colsample_bytree=1)
```

```{r}
# Extracting optimal features for xgboost using cross validation
xgbcv <- xgb.cv( params = params, data = xgb_train_m, nrounds = 100, nfold = 5, showsd = T, stratified = T, print.every.n = 10, early.stop.round = 20, maximize = T)
```

```{r}
# XGBoost algorithm
set.seed(123)
xgb <- xgboost(data = xgb_train_m, params=params, nrounds=41)
```

```{r}
set.seed(123)
# xgboost predictions
xgb_p = predict(xgb, xgb_test_m)
xgb_p = as.factor(round(xgb_p))

xgb_conf <- confusionMatrix(xgb_p, test$Churn,  mode = "everything", positive = "1")
xgb_conf
xgb_f <- xgb_conf$byClass[7]
```

```{r}
# Importance of features
xgb.importance(model=xgb)

# SHaP values for each feature
shap <- shap.prep(xgb_model = xgb, X_train = xgb_train)
shap.plot.summary(shap)
```

```{r}
shap.plot.dependence(shap, "Day.Minutes", alpha = 0.7, jitter_width = 0.1)
shap.plot.dependence(shap, "Day.Minutes", color_feature = 'Evening.Minutes', alpha = 0.7, jitter_width = 0.1)
shap.plot.dependence(shap, "Customer.Service.Calls", color_feature = 'Day.Minutes', alpha = 0.7, jitter_width = 0.1)
```


```{r}
shap.plot.dependence(shap, "International.Minutes", color_feature = 'auto', alpha = 0.7, jitter_width = 0.1)
shap.plot.dependence(shap, "Evening.Minutes", color_feature = 'auto', alpha = 0.7, jitter_width = 0.1)
```


```{r}
set.seed(123)
# Predicting churn with each model
log_pred <- predict(logit, newdata = data_ens, type = 'response')
log_pred <- as.factor(ifelse(log_pred > 0.5,1,0))

svm_pred <- predict(svm_m, newdata = data_ens)

randf_pred <- predict(randomf, newdata = data_ens)

dtree_pred <- predict(dtree, newdata = data_ens, type = 'class')

xgb_matrix <- xgb.DMatrix(data=data.matrix(data_ens[,-12]), label=as.matrix(data_ens$Churn))

xgb_pred <- predict(xgb, newdata = xgb_matrix)
xgb_pred <- as.factor(round(xgb_pred))

# Building the dataframe for ensemble method
predictions <- data.frame(log_pred, svm_pred, randf_pred, dtree_pred, xgb_pred)
predictions$ens <- NA
```

```{r}
# Assigning scores for each model based on respective F-score
sum_f <- sum(logit_f, svm_f, dtree_f, randf_f, xgb_f)
log_score <- (logit_f / sum_f)
svm_score <- (svm_f / sum_f)
randf_score <- (randf_f / sum_f)
dtree_score <- (dtree_f / sum_f)
xgb_score <- (xgb_f / sum_f)

scores <- c(log_score, svm_score, randf_score, dtree_score,xgb_score)
```

```{r}
# Ensemble prediction function
for(i in 1:length(log_pred)){
  y = 0
  n = 0
  for (j in 1:5){
    if (predictions[i, j] == 0){
      n = n + scores[j]
    }else{
      y = y + scores[j]
    }
  }
  if (y > n){
    predictions$ens[i] = 1
  }else{
    predictions$ens[i] = 0
  }
}
```

```{r}
# Measuring the performance of an ensemble
ens_conf <- confusionMatrix(as.factor(predictions$ens), as.factor(data_ens$Churn), mode = 'everything', positive = '1')
ens_conf
ens_f <- ens_conf$byClass[7]

f_scores_all <- c(logit_f, svm_f, dtree_f, randf_f, xgb_f, ens_f)
f_scores_all
```

```{r}
# Min max normalization function
minmax_norm <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}
```

```{r}
# Dataframe consisting only of people who churned
churn_df <- data_ens[data_ens$Churn==1, ]

churn_df <- churn_df[, c("Day.Minutes", "International.Minutes", "Customer.Service.Calls")]

# churn_df <- as.data.frame(lapply(churn_df, minmax_norm))

# churn_df <- churn_df[,!(names(churn_df) %in% c("Plan.Type", "Gender", "Churn"))]

churn_df <- as.data.frame(scale(churn_df))
```

```{r}
# pca_res = prcomp(churn_df, center = TRUE, scale = TRUE)
# summary(pca_res)
```


```{r}
# Visualizing elbow method

wss <- sapply(1:10, function(k){kmeans(churn_df, k, nstart=25,iter.max = 50 )$tot.withinss})

plot(1:10, wss,
     type="b", pch = 19, frame = F,
     xlab="Number of clusters K",
     ylab="Total within-clusters sum of squares")
```

```{r}
# Defining optimal number of k for k-means algorithm
set.seed(123)
# Silhouette method
fviz_nbclust(churn_df, kmeans, nstart = 25, k.max = 15, iter.max = 50, method = "silhouette")+
  labs(subtitle = "Silhouette method")

```

```{r}
# K-means with k=4
k4 <- kmeans(churn_df, centers = 4, iter.max = 50, nstart = 25)
k4
```
```{r}
# Evaluating built clusters with silhouette width
sil <- silhouette(k4$cluster, dist(churn_df))
fviz_silhouette(sil)
```

```{r}
churn_df$cluster = factor(k4$cluster)

p <- plot_ly(churn_df, x=~Day.Minutes, y=~International.Minutes, 
z=~Customer.Service.Calls, color=~cluster) %>% add_markers(size=1.5)
print(p)
```

